Startup
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
✔ ggplot2 3.3.6 ✔ purrr 0.3.4
✔ tibble 3.1.4 ✔ dplyr 1.0.7
✔ tidyr 1.1.3 ✔ stringr 1.4.0
✔ readr 2.0.1 ✔ forcats 0.5.1
── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(cowplot)
library(ggpmisc)
Loading required package: ggpp
Attaching package: ‘ggpp’
The following object is masked from ‘package:ggplot2’:
annotate
library(here)
here() starts at /labs/khatrilab/solomonb/hla_project/hla_benchmark
source(here("helper_functions/data_import_functions.R"))
source(here("helper_functions/figure_format_functions.R"))
Loading required package: flextable
Registered S3 method overwritten by 'data.table':
method from
print.data.table
Registered S3 methods overwritten by 'htmltools':
method from
print.html tools:rstudio
print.shiny.tag tools:rstudio
print.shiny.tag.list tools:rstudio
Attaching package: ‘flextable’
The following object is masked from ‘package:purrr’:
compose
source(here("helper_functions/calculation_functions.R"))
Loading required package: tidymodels
Registered S3 method overwritten by 'tune':
method from
required_pkgs.model_spec parsnip
── Attaching packages ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.1.3 ──
✔ broom 0.7.9 ✔ rsample 0.1.0
✔ dials 0.0.9 ✔ tune 0.1.6
✔ infer 1.0.0 ✔ workflows 0.2.3
✔ modeldata 0.1.1 ✔ workflowsets 0.1.0
✔ parsnip 0.1.7 ✔ yardstick 0.0.8
✔ recipes 0.1.16
── Conflicts ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
✖ ggpp::annotate() masks ggplot2::annotate()
✖ flextable::compose() masks purrr::compose()
✖ scales::discard() masks purrr::discard()
✖ dplyr::filter() masks stats::filter()
✖ recipes::fixed() masks stringr::fixed()
✖ dplyr::lag() masks stats::lag()
✖ yardstick::spec() masks readr::spec()
✖ recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
Loading required package: checkmate
Attaching package: ‘checkmate’
The following object is masked from ‘package:recipes’:
check_class
Loading required package: lubridate
Attaching package: ‘lubridate’
The following object is masked from ‘package:cowplot’:
stamp
The following objects are masked from ‘package:base’:
date, intersect, setdiff, union
isb_path <- here("data/isb")
isb_samples <- read_tsv(here("data/isb/logs/parallel.log")) %>%
separate(Command, into = c(NA, "sample"), sep = " ") %>%
pull(sample) %>% unique()
Rows: 294 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): Host, Command
dbl (7): Seq, Starttime, JobRuntime, Send, Receive, Exitval, Signal
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Get reference alleles
reference_alleles <- read_tsv(here("references/GCh38_reference_alleles.txt"), col_names = F) %>% pull(1)
Rows: 26 Columns: 1
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (1): X1
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
genome_region_reference <- get_allele_length(reference_alleles)
Get arcasHLA alignment stats
arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)
Import cell counts
cell_counts_df <- readRDS(here("data/isb/isb_sequenced_cell_count.RDS"))
Assemble accuracy DF
success_df <- readRDS(here("2_accuracy/isb_success.RDS"))
accuracy_df <- readRDS(here("3_DRB/isb_accuracy_drb345_filtered.RDS"))
accuracy_df <- accuracy_df %>%
left_join(
success_df %>% select(sample:field, genotyper, success = accuracy),
by = c("sample", "locus", "field", "genotyper")
)%>%
left_join(alignment_stats_df, by = c("sample", "locus")) %>%
left_join(genome_region_reference, by = "locus") %>%
left_join(cell_counts_df, by = "sample") %>%
mutate_at(vars(c(reads, bp)), as.numeric) %>%
mutate(coverage = (reads * 91)/bp) %>%
mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer")
Coverage based plots
gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number based plots
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

Subsample READS
Import read subsample data
# Import predicted genotypes
subsample_reads_path <- here("data/isb/subsample")
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>%
filter(grepl("^INCOV", sample)) %>%
separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>%
distinct() %>%
pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_reads_df <- subsample_reads_df %>%
select(sample) %>%
distinct() %>%
separate(sample, into = c("sample", "subsample"), sep = "_") %>%
left_join(invitro_df, by = "sample") %>%
unite(sample, sample, subsample, sep = "_") %>%
format_hla_table() %>%
drop_na()
invitro_reads_df
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)
Calculate accuracy
# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
hla_df = bind_rows(
subsample_reads_df,
invitro_reads_df
),
reference = "invitro",
method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, "isb_subsample_reads_accuracy.RDS")
# Calculate success
subsample_reads_success_df <- compare_hla(
hla_df = bind_rows(
subsample_reads_df,
invitro_reads_df
),
reference = "invitro",
method = "success"
)
# saveRDS(subsample_reads_success_df, "isb_subsample_reads_success.RDS")
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>%
left_join(
subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
by = c("sample", "locus", "field", "genotyper")
)%>%
left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>%
left_join(genome_region_reference, by = "locus") %>%
bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
mutate_at(vars(c(reads, bp)), as.numeric) %>%
mutate(coverage = (reads * 91)/bp) %>% # Calculate coverage
mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% # Calculate n_alleles
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer")
# saveRDS(subsample_reads_combined_df, "subsample_reads_combined_df.RDS")
# subsample_reads_combined_df <- readRDS("subsample_reads_combined_df.RDS")
Reads plots
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy")
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy")
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success")
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)

plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
- Default arcasHLA cut off 75 reads (log10 = 1.8)
subsample_reads_combined_df %>%
filter(field == "field_2") %>%
mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>%
ggplot(aes(y = n_alleles, x = log10(coverage)))+
# geom_violin(scale = "count", alpha = 0.5, )+
geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
# geom_boxplot(width=0.1)+
# ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
# coord_flip()+
facet_grid(genotyper ~ locus, scales = "free")+
theme_bw()

Subsample CELLS
# Import predicted genotypes
subsample_cells_path <- here("data/isb/subsample_cells")
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>%
filter(grepl("^INCOV", sample)) %>%
separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>%
distinct() %>%
pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
Rows: 1557 Columns: 9
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (8): Sample ID, Locus, Allele 1, Allele 2, Comments, Diploid Ambiguities, Allele 1 Ambiguities, Allele 2 Ambiguities
dbl (1): index
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
invitro_cells_df <- subsample_cells_df %>%
select(sample) %>%
distinct() %>%
separate(sample, into = c("sample", "subsample"), sep = "_") %>%
left_join(invitro_df, by = "sample") %>%
unite(sample, sample, subsample, sep = "_") %>%
format_hla_table() %>%
drop_na()
invitro_cells_df
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
# Import cell counts for each subsample
subsample_cell_counts_df <- read_csv(here("data/isb/subsample_cells/subsample_cell_counts.csv"))
Rows: 1176 Columns: 2
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (1): sample
dbl (1): n_cells
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Calculate accuracy
# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
hla_df = bind_rows(
subsample_cells_df,
invitro_cells_df
),
reference = "invitro",
method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, "isb_subsample_cells_accuracy.RDS")
# subsample_cells_accuracy_df <- readRDS("isb_subsample_cells_accuracy.RDS")
# Calculate success
subsample_cells_success_df <- compare_hla(
hla_df = bind_rows(
subsample_cells_df,
invitro_cells_df
),
reference = "invitro",
method = "success"
)
# saveRDS(subsample_cells_success_df, "isb_subsample_cells_success.RDS")
# subsample_cells_success_df <- readRDS("isb_subsample_cells_success.RDS")
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>%
left_join(
subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
by = c("sample", "locus", "field", "genotyper")
)%>%
left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>%
left_join(genome_region_reference, by = "locus") %>%
left_join(subsample_cell_counts_df, by = "sample") %>%
bind_rows(accuracy_df) %>%
mutate_at(vars(c(reads, bp)), as.numeric) %>%
mutate(coverage = (reads * 91)/bp) %>%
mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%
filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer")
# saveRDS(subsample_cells_combined_df, "subsample_cells_combined_df.RDS")
Coverage plots
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)

Cell number plots
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy")
`geom_smooth()` using formula 'y ~ x'

plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy")
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy")
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success")
`geom_smooth()` using formula 'y ~ x'

plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success")
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success")
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)

plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
# subsample_cells_combined_df %>%
# mutate(reads = reads/n_cells) %>%
# filter(reads < 100) %>%
# gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F)
Combined plots
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
plot_grid(
plt_read_accuracy_abc + no_leg,
plt_read_success_abc + no_leg,
plt_read_allele_abc,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[1:3]
),
plot_grid(
plt_cell_accuracy_abc + no_leg,
plt_cell_success_abc + no_leg,
plt_cell_allele_abc,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[4:6]
),
plot_grid(
legend,
legend,
NA,
ncol = 1
),
nrow = 1,
rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
`geom_smooth()` using formula 'y ~ x'
plot_grid(
plot_grid(
plt_read_accuracy_all + no_leg,
plt_read_success_all + no_leg,
plt_read_allele_all,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[1:3]
),
plot_grid(
plt_cell_accuracy_all + no_leg,
plt_cell_success_all + no_leg,
plt_cell_allele_all,
ncol = 1,
align = "v",
axis = "lr",
rel_heights = c(1,1,1.2),
labels = LETTERS[4:6]
),
plot_grid(
legend,
legend,
NA,
ncol = 1
),
nrow = 1,
rel_widths = c(1,1,0.25)
)
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'
`geom_smooth()` using formula 'y ~ x'

Main plot
loci <- c("A", "B", "C", "DPB1", "DQA1", "DRB1")
plt_read_accuracy_subset <- gg_coverage(
subsample_reads_combined_df %>%
filter(locus %in% loci) %>%
filter(!(genotyper == "phlat" & locus == "DPB1")) %>%
filter(!(genotyper == "optitype" & locus == "DPB1")) %>%
filter(!(genotyper == "optitype" & locus == "DQA1")) %>%
filter(!(genotyper == "optitype" & locus == "DRB1")),
x_var = "reads", y_var = "accuracy") +
facet_wrap(~locus, nrow = 2)
plt_read_accuracy_subset
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).

save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_read_accuracy_subset, base_height = 4, base_width = 6)
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df %>%
filter(locus %in% loci) %>%
filter(!(genotyper == "phlat" & locus == "DPB1")) %>%
filter(!(genotyper == "optitype" & locus == "DPB1")) %>%
filter(!(genotyper == "optitype" & locus == "DQA1")) %>%
filter(!(genotyper == "optitype" & locus == "DRB1")), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all
Warning: Removed 62 rows containing missing values (geom_point).

plt_main <- plot_grid(
plt_read_accuracy_subset,
plt_read_allele_all,
ncol = 1,
rel_heights = c(3,2),
align = "v",
axis = "lr",
labels = c("A", "B")
)
`geom_smooth()` using formula 'y ~ x'
Warning: Removed 59 rows containing non-finite values (stat_smooth).
Warning: Removed 59 rows containing missing values (geom_point).
Warning: Removed 62 rows containing missing values (geom_point).
plt_main

save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_main, base_height = 7, base_width = 7)
Slope stats
get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
# Input checks
arg_col <- makeAssertCollection()
assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
assertLogical(x_log, add = arg_col)
if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
# browser()
df <- df %>%
filter(field == field_var) %>%
exclude_genotyper_fields() # Replace w/ exclude_genotyper_fields?
if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
df %>%
filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>%
group_by(locus, field, genotyper) %>%
nest() %>%
mutate(data = map(data, function(x){
summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
})) %>% unnest_wider(data) %>%
mutate(est_min = round(estimate - 2*std.error, 2),
est_max = round(estimate + 2*std.error, 2))
}
model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
model_stats <- bind_rows(
get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
) %>%
# filter(grepl("^[ABC]", locus)) %>%
ungroup() %>%
filter(field == "field_2")
Table
| Reads | Cells |
| A | B | C | DPA1 | DPB1 | DQA1 | DQB1 | DRB1 | A | B | C | DPA1 | DPB1 | DQA1 | DQB1 | DRB1 |
Accuracy | arcasHLA | (0.09) - (0.14) | (0.09) - (0.14) | (0.11) - (0.15) | (0.26) - (0.29) | (0.23) - (0.28) | (0.26) - (0.3) | (0.23) - (0.27) | (0.19) - (0.24) | (0.04) - (0.1) | (0.05) - (0.11) | (0.06) - (0.11) | (0.29) - (0.34) | (0.26) - (0.32) | (0.3) - (0.35) | (0.26) - (0.31) | (0.18) - (0.24) |
OptiType | (-0.05) - (0) | (-0.03) - (0.02) | (-0.01) - (0.03) | --- | --- | --- | --- | --- | (-0.06) - (0) | (-0.04) - (0.01) | (-0.04) - (0.02) | --- | --- | --- | --- | --- |
PHLAT | (0.05) - (0.1) | (0.07) - (0.12) | (0.08) - (0.12) | --- | --- | (0.18) - (0.22) | (0.18) - (0.22) | (0.13) - (0.18) | (0.04) - (0.09) | (0.06) - (0.11) | (0.05) - (0.1) | --- | --- | (0.23) - (0.28) | (0.21) - (0.26) | (0.15) - (0.2) |
Success | arcasHLA | (-0.07) - (0.01) | (-0.01) - (0.07) | (-0.01) - (0.07) | (0.04) - (0.11) | (-0.01) - (0.09) | (-0.04) - (0.06) | (-0.04) - (0.08) | (-0.04) - (0.03) | (-0.01) - (0.06) | (0.01) - (0.09) | (-0.01) - (0.06) | (0.1) - (0.17) | (0.04) - (0.14) | (0.04) - (0.14) | (0.01) - (0.12) | (-0.02) - (0.05) |
OptiType | (0.01) - (0.06) | (0.03) - (0.08) | (0.05) - (0.09) | --- | --- | --- | --- | --- | (0.01) - (0.07) | (0.02) - (0.08) | (0.03) - (0.08) | --- | --- | --- | --- | --- |
PHLAT | (0.05) - (0.1) | (0.08) - (0.13) | (0.08) - (0.12) | --- | --- | (0.12) - (0.18) | (0.11) - (0.17) | (0.12) - (0.17) | (0.04) - (0.1) | (0.06) - (0.13) | (0.05) - (0.11) | --- | --- | (0.13) - (0.21) | (0.12) - (0.18) | (0.12) - (0.18) |
Scratch work
---
title: "5) Coverage Analysis"
output: 
  html_notebook:
    toc: true
    toc_depth: 3
    toc_float: true
    code_folding: hide
---


# Startup 

```{r, message = F}
library(tidyverse)
library(cowplot)
library(ggpmisc)
library(here)

source(here("helper_functions/data_import_functions.R"))
source(here("helper_functions/figure_format_functions.R"))
source(here("helper_functions/calculation_functions.R"))
isb_path <- here("data/isb")
isb_samples <- read_tsv(here("data/isb/logs/parallel.log")) %>% 
  separate(Command, into = c(NA, "sample"), sep = " ") %>% 
  pull(sample) %>% unique()
```

### Get reference alleles

- GCh38 reference alleles (per https://www.ebi.ac.uk/ipd/imgt/hla/help/genomics.html)
- To be used to calculate coverage

```{r, message = F}
reference_alleles <- read_tsv(here("references/GCh38_reference_alleles.txt"), col_names = F) %>% pull(1)
genome_region_reference <- get_allele_length(reference_alleles)
```

### Get arcasHLA alignment stats

```{r}
arcas_log_dir <- sprintf("%s/arcasHLA", isb_path)
alignment_stats_df <- hla_mapping_stats_import(isb_samples, arcas_log_dir)
```

### Import cell counts

```{r}
cell_counts_df <- readRDS(here("data/isb/isb_sequenced_cell_count.RDS"))
```

### Assemble accuracy DF

```{r}
success_df <- readRDS(here("2_accuracy/isb_success.RDS"))
accuracy_df <- readRDS(here("3_DRB/isb_accuracy_drb345_filtered.RDS"))
accuracy_df <- accuracy_df %>% 
  left_join(
    success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(cell_counts_df, by = "sample") %>%
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)$", sample) & genotyper != "hlaminer") 
```


### Coverage based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number based plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(accuracy_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


# Subsample READS

### Import read subsample data

```{r}
# Import predicted genotypes
subsample_reads_path <- here("data/isb/subsample")
subsample_reads_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_reads_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_reads_df <- format_hla_table(combine_HLA_import(path = subsample_reads_path, samples = subsample_reads_samples))
subsample_reads_df <- subsample_reads_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_reads_df
```

```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_reads_df <- subsample_reads_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_reads_df
```

```{r}
# Import read statistics
subsample_reads_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_reads_path)
subsample_reads_alignment_stats_df <- hla_mapping_stats_import(subsample_reads_samples, subsample_reads_arcas_log_dir)
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_reads_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_reads_accuracy_df, here("5_coverage_accuracy/isb_subsample_reads_accuracy.RDS"))
```


```{r}
# Calculate success
subsample_reads_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_reads_df,
    invitro_reads_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_reads_success_df, here("5_coverage_accuracy/isb_subsample_reads_success.RDS"))
```

```{r}
# Combine various data
subsample_reads_combined_df <- subsample_reads_accuracy_df %>% 
  left_join(
    subsample_reads_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_reads_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  bind_rows(accuracy_df %>% select(-n_cells)) %>% # No cell numbers for read subsamples
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>%  # Calculate coverage
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>%  # Calculate n_alleles
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_reads_combined_df, here("5_coverage_accuracy/subsample_reads_combined_df.RDS"))
# subsample_reads_combined_df <- readRDS(here("5_coverage_accuracy/subsample_reads_combined_df.RDS"))
```

### Reads plots
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy") 
plt_read_accuracy_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "success")
plt_read_success_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df, x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_abc <- gg_coverage(subsample_reads_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
```
- Default arcasHLA cut off 75 reads (log10 = 1.8)

```{r, fig.width=12, fig.height=3, warning = F, message = F}
subsample_reads_combined_df %>% 
  filter(field == "field_2") %>% 
  mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>% 
  ggplot(aes(y = n_alleles, x = log10(coverage)))+
  # geom_violin(scale = "count", alpha = 0.5, )+
  geom_jitter(size = 0.2, width = 0, height =0.2, alpha = 0.2)+
  # geom_boxplot(width=0.1)+
  # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+
  # coord_flip()+
  facet_grid(genotyper ~ locus, scales = "free")+
  theme_bw()
```

# Subsample CELLS

```{r}
# Import predicted genotypes
subsample_cells_path <- here("data/isb/subsample_cells")
subsample_cells_samples <- tibble(sample = list.files(sprintf("%s/arcasHLA", subsample_cells_path))) %>% 
  filter(grepl("^INCOV", sample)) %>% 
  separate(sample, into = c("sample"), sep = "\\.", extra = "drop") %>% 
  distinct() %>% 
  pull(sample)
subsample_cells_df <- format_hla_table(combine_HLA_import(path = subsample_cells_path, samples = subsample_cells_samples))
subsample_cells_df <- subsample_cells_df %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)_", sample))
subsample_cells_df
```


```{r, message = F}
# Expand invitro genotypes across read subsample names
invitro_df <- invitro_import()
invitro_cells_df <- subsample_cells_df %>% 
  select(sample) %>% 
  distinct() %>% 
  separate(sample, into = c("sample", "subsample"), sep = "_") %>% 
  left_join(invitro_df, by = "sample") %>% 
  unite(sample, sample, subsample, sep = "_") %>% 
  format_hla_table() %>% 
  drop_na()
invitro_cells_df
```

```{r}
# Import read statistics
subsample_cells_arcas_log_dir <- sprintf("%s/arcasHLA", subsample_cells_path)
subsample_cells_alignment_stats_df <- hla_mapping_stats_import(subsample_cells_samples, subsample_cells_arcas_log_dir)
```

```{r}
# Import cell counts for each subsample
subsample_cell_counts_df <- read_csv(here("data/isb/subsample_cells/subsample_cell_counts.csv"))
```

### Calculate accuracy

```{r}
# Calculate accuracy
subsample_cells_accuracy_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "accuracy"
)
# saveRDS(subsample_cells_accuracy_df, here("5_coverage_accuracy/isb_subsample_cells_accuracy.RDS"))
# subsample_cells_accuracy_df <- readRDS(here("5_coverage_accuracy/isb_subsample_cells_accuracy.RDS"))
```

```{r}
# Calculate success
subsample_cells_success_df <- compare_hla(
  hla_df = bind_rows(
    subsample_cells_df,
    invitro_cells_df
  ),
  reference = "invitro", 
  method = "success"
)
# saveRDS(subsample_cells_success_df, here("5_coverage_accuracy/isb_subsample_cells_success.RDS"))
# subsample_cells_success_df <- readRDS(here("5_coverage_accuracy/isb_subsample_cells_success.RDS"))
```

```{r}
# Combine various data
subsample_cells_combined_df <- subsample_cells_accuracy_df %>% 
  left_join(
    subsample_cells_success_df %>% select(sample:field, genotyper, success = accuracy),
    by = c("sample", "locus", "field", "genotyper")
  )%>% 
  left_join(subsample_cells_alignment_stats_df, by = c("sample", "locus")) %>% 
  left_join(genome_region_reference, by = "locus") %>%
  left_join(subsample_cell_counts_df, by = "sample") %>%
  bind_rows(accuracy_df) %>% 
  mutate_at(vars(c(reads, bp)), as.numeric) %>% 
  mutate(coverage = (reads * 91)/bp) %>% 
  mutate(n_alleles = map_dbl(allele, function(x) sum(!is.na(x)))) %>% 
  filter(grepl("^[ABC]|D.[AB]1", locus) & grepl("-(AC|BL)", sample) & genotyper != "hlaminer") 
# saveRDS(subsample_cells_combined_df, here("5_coverage_accuracy/subsample_cells_combined_df.RDS"))
```

### Coverage plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "success")
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "coverage", y_var = "n_alleles", facet_genotyper = T)
```

### Cell number plots

```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy") 
plt_cell_accuracy_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "accuracy") 
```
```{r, fig.width=12, fig.height=2.5, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "success") 
plt_cell_success_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "success") 
```
```{r, fig.width=12, fig.height=3, warning = F, message = F}
gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_all <- gg_coverage(subsample_cells_combined_df, x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
plt_cell_allele_abc <- gg_coverage(subsample_cells_combined_df %>% filter(grepl("^[ABC]", locus)), x_var = "n_cells", y_var = "n_alleles", facet_genotyper = T)
```


```{r, fig.width=12, fig.height=3, warning = F, message = F}
# subsample_cells_combined_df %>% 
#   mutate(reads = reads/n_cells) %>% 
#   filter(reads < 100) %>% 
#   gg_coverage(x_var = "reads", y_var = "accuracy", x_log = F) 
```








# Combined plots
```{r, fig.width = 9, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_abc)
plot_grid(
  plot_grid(
    plt_read_accuracy_abc + no_leg,
    plt_read_success_abc + no_leg,
    plt_read_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_abc + no_leg,
    plt_cell_success_abc + no_leg,
    plt_cell_allele_abc,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

```{r, fig.width = 18, fig.height = 7, message = F, warning = F}
no_leg <- list(theme(legend.position = "none"))
legend <- cowplot::get_legend(plt_read_accuracy_all)
plot_grid(
  plot_grid(
    plt_read_accuracy_all + no_leg,
    plt_read_success_all + no_leg,
    plt_read_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[1:3]
  ),
  plot_grid(
    plt_cell_accuracy_all + no_leg,
    plt_cell_success_all + no_leg,
    plt_cell_allele_all,
    ncol = 1,
    align = "v",
    axis = "lr",
    rel_heights = c(1,1,1.2),
    labels = LETTERS[4:6]
  ),
  plot_grid(
    legend,
    legend,
    NA,
    ncol = 1
  ),
  nrow = 1,
  rel_widths = c(1,1,0.25)
)
```

### Main plot

```{r}
loci <- c("A", "B", "C", "DPB1", "DQA1", "DRB1")
plt_read_accuracy_subset <- gg_coverage(
  subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), 
  x_var = "reads", y_var = "accuracy") +
  facet_wrap(~locus, nrow = 2)
plt_read_accuracy_subset
```
```{r}
save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_read_accuracy_subset, base_height = 4, base_width = 6)
```

```{r}
plt_read_allele_all <- gg_coverage(subsample_reads_combined_df %>% 
    filter(locus %in% loci) %>% 
    filter(!(genotyper == "phlat" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DPB1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DQA1")) %>% 
    filter(!(genotyper == "optitype" & locus == "DRB1")), x_var = "reads", y_var = "n_alleles", facet_genotyper = T)
plt_read_allele_all
```
```{r, fig.width = 7, fig.height=7}
plt_main <- plot_grid(
  plt_read_accuracy_subset,
  plt_read_allele_all,
  ncol = 1,
  rel_heights = c(3,2),
  align = "v",
  axis = "lr",
  labels = c("A", "B")
)
plt_main
```
```{r}
save_plot(here("figures_pdf/v2/5_coverage.pdf"), plt_main, base_height = 7, base_width = 7)
```

# Slope stats

```{r}
get_lm_stats <- function(df, x_var = "coverage", y_var = "accuracy", field_var = "field_2", x_log = T){
  # Input checks
  arg_col <- makeAssertCollection()
  assertChoice(y_var, c("accuracy", "success", "n_alleles"), add = arg_col)
  assertChoice(x_var, c("reads", "coverage", "n_cells"), add = arg_col)
  assertChoice(field_var, c("field_1", "field_2", "field_3"), add = arg_col)
  assertLogical(x_log, add = arg_col)
  if (arg_col$isEmpty()==F) {map(arg_col$getMessages(),print);reportAssertions(arg_col)}
  
  # browser()
  df <- df %>% 
    filter(field == field_var) %>% 
    exclude_genotyper_fields() # Replace w/ exclude_genotyper_fields?
  if (x_log ==T){df <- df %>% mutate_at(x_var, log10)}
  df %>%
  filter_at(x_var, function(x) !is.na(x) & !is.infinite(x)) %>% 
  group_by(locus, field, genotyper) %>%
  nest() %>%
  mutate(data = map(data, function(x){
    summary(lm(!!sym(y_var) ~ !!sym(x_var), data = x)) %>% broom::tidy() %>% filter(term == x_var)
  })) %>% unnest_wider(data) %>%
  mutate(est_min = round(estimate - 2*std.error, 2),
         est_max = round(estimate + 2*std.error, 2))
}

model_stats <- get_lm_stats(subsample_cells_combined_df, x_var = "reads", y_var = "accuracy", x_log = T)
model_stats
```

```{r}
model_stats <- bind_rows(
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "accuracy", x_log = T) %>% mutate(x_var = "reads", y_var = "accuracy"),
  get_lm_stats(subsample_reads_combined_df, x_var = "reads", y_var = "success", x_log = T) %>% mutate(x_var = "reads", y_var = "success"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "accuracy", x_log = T) %>% mutate(x_var = "cells", y_var = "accuracy"),
  get_lm_stats(subsample_cells_combined_df, x_var = "n_cells", y_var = "success", x_log = T) %>% mutate(x_var = "cells", y_var = "success")
)  %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") 
```



# Table
```{r}
col_vars <- c("x_var", "locus")
row_vars <- c("y_var", "genotyper")

flex_df <- model_stats %>% 
  # filter(grepl("^[ABC]", locus)) %>% 
  ungroup() %>% 
  filter(field == "field_2") %>% 
  mutate(CI = sprintf("(%s) - (%s)", est_min, est_max)) %>% 
  mutate(genotyper = reformat_hla_genotyper(genotyper)) %>% 
  select(genotyper, locus, CI, x_var, y_var) %>% 
  pivot_wider(names_from = col_vars, values_from = "CI", names_sep =  "-") 

df_key <- tibble(col_keys = names(flex_df)) %>% 
  filter(!(col_keys %in% c(col_vars, row_vars))) %>% 
  separate(col_keys, into = col_vars, sep = "-", remove = F) %>%
  mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>% 
  arrange_at(col_vars) %>% 
  mutate_all(as.character) %>% 
  mutate_at(col_vars[!(col_vars %in% "locus")], str_to_sentence)

output_table <- flex_df %>% 
  select(row_vars, everything()) %>% 
  mutate_at(row_vars[!(row_vars %in% "genotyper")], str_to_sentence) %>% 
  # select(locus, df_key$col_keys) %>% 
  flextable() %>% 
  colformat_char(na_str = "---") %>%
  merge_v(j=1:2) %>% 
  set_header_df(mapping = df_key, key = "col_keys") %>% 
  merge_h(part = "header") %>%
  theme_vanilla() %>% 
  # vline(j=vlines_sequence, border = fp_border_default()) %>%
  fix_border_issues() %>% 
  autofit() %>% 
  fontsize(size = 8, part = "all") 

print(output_table)
# output_table %>% print(preview = "pptx")
```


<!-- ```{r} -->
<!--   df <- df %>%  -->
<!--     mutate(field = reformat_hla_field(field)) %>%  -->
<!--     mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>%  -->
<!--     mutate(cell_value = ifelse(!is.na(se),  -->
<!--                                sprintf("%s %s %s", round(mean_accuracy,2),"\u00B1",round(se,2)), -->
<!--                                sprintf("%s", round(mean_accuracy,2)))) %>% -->
<!--     mutate(cell_value = ifelse(grepl("NA", cell_value), NA, cell_value)) %>%  -->
<!--     select(-sd,-se, -mean_accuracy) %>%  -->
<!--     pivot_wider(names_from = nesting_vars, values_from = "cell_value", names_sep = "-")  -->

<!--   df_key <- suppressWarnings({tibble(col_keys = names(df)) %>%  -->
<!--       filter(!grepl("locus", col_keys)) %>%  -->
<!--       separate(col_keys, into = nesting_vars, sep = "-", remove = F) %>% -->
<!--       mutate_at(vars(contains("genotyper")), reformat_hla_genotyper) %>%  -->
<!--       arrange_at(nesting_vars) %>%  -->
<!--       mutate_all(as.character) -->
<!--   }) -->

<!--   vline_var <- tail(nesting_vars,1) -->
<!--   vlines_sequence <- seq(1,nrow(df_key),by = length(unique(df_key[[vline_var]]))) -->
<!--   df <- df %>%  -->
<!--     select(locus, df_key$col_keys) %>%  -->
<!--     flextable() %>%  -->
<!--     colformat_char(na_str = "---") %>%  -->
<!--     set_header_df(mapping = df_key, key = "col_keys") %>%  -->
<!--     merge_h(part = "header") %>%  -->
<!--     theme_vanilla() %>%  -->
<!--     vline(j=vlines_sequence, border = fp_border_default()) %>% -->
<!--     fix_border_issues() -->
<!--   return(df) -->
<!-- ``` -->





# Scratch work

<!-- ```{r, fig.width=12, fig.height=3, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   filter(field == "field_2") %>%  -->
<!--   mutate(n_alleles = factor(n_alleles, levels = 0:2)) %>%  -->
<!--   ggplot(aes(x = n_alleles, y = log10(n_cells)))+ -->
<!--   geom_violin(scale = "width")+ -->
<!--   geom_jitter(size = 0.2, height = 0, width =0.05, alpha = 0.2)+ -->
<!--   # geom_boxplot(width=0.1)+ -->
<!--   # ggbeeswarm::geom_beeswarm(size = 0.1, alpha = 0.1)+ -->
<!--   coord_flip()+ -->
<!--   facet_grid(genotyper ~ locus, scales = "free")+ -->
<!--   theme_bw() -->
<!-- ``` -->
<!-- # Coverage stats -->
<!-- ```{r, fig.width=12, fig.height=3, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   ggplot(aes(x = n_cells, y = coverage, color = genotyper)) +  -->
<!--   geom_point() + -->
<!--   facet_wrap(genotyper ~ locus, scales = "free", nrow = 3) +  -->
<!--   theme_bw() + -->
<!--   theme(strip.text = element_blank(), -->
<!--         axis.text.x = element_text(angle = 45, hjust = 1))+ -->
<!--   scale_color_brewer(palette = "Dark2") -->
<!-- ``` -->

<!-- ```{r, fig.width=16, fig.height=3, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   select(-genotyper) %>% distinct() %>%  -->
<!--   mutate(cov_per_cell = coverage/n_cells) %>%  -->
<!--   ggplot(aes(x = log10(n_cells), y = cov_per_cell)) +  -->
<!--   geom_point(color = "black", alpha = 0.1) + -->
<!--   stat_smooth(method = "lm")+ -->
<!--   # stat_smooth()+ -->
<!--     facet_wrap( ~ locus, scales = "free", nrow = 1) + -->
<!--   theme_bw() + -->
<!--   theme(axis.text.x = element_text(angle = 45, hjust = 1))+ -->
<!--   scale_color_brewer(palette = "Dark2") -->
<!-- ``` -->

<!-- ```{r, fig.width=16, fig.height=6, warning = F, message = F} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   select(-genotyper) %>% distinct() %>%  -->
<!--   filter(!is.na(n_cells)) %>%  -->
<!--   mutate(cov_per_cell = coverage/n_cells) %>%  -->
<!--   mutate(n_cells = cut(n_cells, seq(0,10000,by=2500), right = F)) %>%  -->
<!--   ggplot(aes(x = n_cells, y = cov_per_cell)) +  -->
<!--   stat_summary(fun = function(x) mean(x,na.rm=T), geom = "bar") + -->
<!--   # geom_point(color = "black", alpha = 0.1) + -->
<!--   # stat_smooth(method = "lm")+ -->
<!--   # stat_smooth()+ -->
<!--   facet_wrap( ~ locus, scales = "free", nrow = 1) + -->
<!--   theme_bw() + -->
<!--   theme(axis.text.x = element_text(angle = 45, hjust = 1))+ -->
<!--   scale_color_brewer(palette = "Dark2") -->
<!-- ``` -->

<!-- ```{r} -->
<!-- hist(cut(1:100, seq(1,100,10))) -->
<!-- ``` -->

<!-- ```{r} -->
<!-- subsample_cells_combined_df %>%  -->
<!--   ungroup() %>%  -->
<!--   count(genotyper, locus, field) -->
<!-- ``` -->
<!-- ```{r} -->
<!-- subsample_reads_combined_df %>%  -->
<!--   ungroup() %>%  -->
<!--   count(genotyper, locus, field) -->
<!-- ``` -->
